library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(readr)
library(leaflet)
#library(mapboxapi)
#library(htmlwidgets)
#library(osmdata)
library(sf)
## Warning: package 'sf' was built under R version 4.1.2
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
#library(ggmap)
#library(magrittr)
#library(spatstat)
library(sp)
library(htmlwidgets)
library(webshot)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# Path to the 2019 Bluebikes trip data (one row per trip, ~2.5M rows).
file_name <- "data/bluebikes_tripdata_2019.csv"
# read:
data_2019 <- read_csv(file_name, progress = FALSE)
## Rows: 2522771 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): start station name, end station name, usertype
## dbl (12): tripduration, start station id, start station latitude, start sta...
## dttm (2): starttime, stoptime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the space-separated CSV column names to snake_case so they are
# easier to reference downstream. (`end station id` keeps its original
# name; it is not used later.)
data_2019 <- data_2019 %>%
  rename(start_long = `start station longitude`,
         start_lat = `start station latitude`,
         end_long = `end station longitude`,
         end_lat = `end station latitude`,
         start_station_id = `start station id`,
         start_station_name = `start station name`,
         end_station_name = `end station name`,
         birth_year = `birth year`)
Exploring basic patterns in the data that might be interesting for further exploration
# Quick sanity checks on the 2019 data: counts by user type and gender
# code, plus ranges of birth year and trip duration.
# subscribers vs. customers (single use)
data_2019 %>% filter(usertype == "Subscriber") %>% count() # returns 1988494
## # A tibble: 1 × 1
## n
## <int>
## 1 1988494
data_2019 %>% filter(usertype == "Customer") %>% count() # returns 534277
## # A tibble: 1 × 1
## n
## <int>
## 1 534277
# genders
# (the gender coding is undocumented in this file -- the 0/1/2 guesses
# below are unverified; TODO confirm against the Bluebikes data dictionary)
data_2019 %>% filter(gender == 0) %>% count() # 277703 (female?)
## # A tibble: 1 × 1
## n
## <int>
## 1 277703
data_2019 %>% filter(gender == 1) %>% count() # 1652699 (male?)
## # A tibble: 1 × 1
## n
## <int>
## 1 1652699
data_2019 %>% filter(gender == 2) %>% count() # 592369 (other?)
## # A tibble: 1 × 1
## n
## <int>
## 1 592369
# age range
data_2019 %>% select(birth_year) %>% range() # return 1886 2003
## [1] 1886 2003
data_2019 %>% filter(birth_year == 1886) %>% count() # returns 3
## # A tibble: 1 × 1
## n
## <int>
## 1 3
# trip duration range (seconds)
data_2019 %>% select(tripduration) %>% range() # 61 42567137
## [1] 61 42567137
There are many more subscribers which suggests that it’s mostly locals using the bikes.
Maybe it’s just me but it seems like there’s something off about the gender data, so I might not include that.
There is also something weird about the age data - there are three entries with birth year 1886 and in general quite a few implausibly old riders, so again, I might not use this information.
There is also a massive range in trip duration but that’s not necessarily an error
On the basis of this exploration I select the columns I want to use to reduce the strain on my computer when working with such large files
# Keep only the columns used in the rest of the analysis to cut memory use.
cleaned_2019 <- data_2019 %>% select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))
cleaned_2019
## # A tibble: 2,522,771 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 790 2019-12-01 00:01:25 2019-12-01 00:14:35 42.4 -71.1
## 2 166 2019-12-01 00:05:42 2019-12-01 00:08:29 42.4 -71.1
## 3 323 2019-12-01 00:08:28 2019-12-01 00:13:52 42.4 -71.1
## 4 709 2019-12-01 00:08:38 2019-12-01 00:20:27 42.4 -71.1
## 5 332 2019-12-01 00:10:08 2019-12-01 00:15:41 42.4 -71.1
## 6 507 2019-12-01 00:14:26 2019-12-01 00:22:53 42.4 -71.1
## 7 794 2019-12-01 00:17:46 2019-12-01 00:31:01 42.4 -71.1
## 8 1354 2019-12-01 00:18:19 2019-12-01 00:40:54 42.4 -71.1
## 9 1227 2019-12-01 00:19:01 2019-12-01 00:39:29 42.3 -71.1
## 10 1068 2019-12-01 00:24:41 2019-12-01 00:42:29 42.3 -71.1
## # … with 2,522,761 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
Let’s make a map and plot the end and start stations as points. Let’s start by plotting each start station. I only want one entry pr. start station, so I need to find the unique stations
# One row per distinct start station (coordinates + name).
unique_start <- cleaned_2019 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
# (rows at exactly (0, 0) -- presumably a missing-location sentinel; TODO confirm)
unique_start <- unique_start[!(unique_start$start_long == 0.0000|unique_start$start_lat == 0.0000), ]
# other outliers
unique_start <- unique_start %>% filter(start_station_name != "BCBS Hingham")
unique_start <- unique_start %>% filter(start_station_name != "Main St at Beacon St")
I do the same for the end stations
# One row per distinct end station, cleaned the same way as the start stations.
unique_end <- cleaned_2019 %>% select(c(end_long, end_lat, end_station_name)) %>% unique()
# I do the same for the end stations
unique_end <- unique_end[!(unique_end$end_long == 0.0000|unique_end$end_lat == 0.000), ]
# another outlier
unique_end <- unique_end %>% filter(end_station_name != "BCBS Hingham")
Make two quick maps
# Quick leaflet maps centred on Boston: start stations (m1), end stations (m2).
m1 <- leaflet() %>% addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 12) %>%
addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat)
m1
m2 <- leaflet(unique_end) %>% addTiles() %>% setView(lng = -71.057083, lat = 42.361145, zoom = 12) %>% addCircleMarkers(lng = unique_end$end_long, lat = unique_end$end_lat)
m2
#saveWidget(m2, "out/2019_end_stations.html", selfcontained = FALSE)
The two maps look almost identical.
load Boston city boundary and neighborhoods
# neighborhoods: 26 Boston neighborhood polygons from a shapefile
neighborhoods <- st_read("data/Boston_Neighborhoods/Boston_Neighborhoods.shp")
## Reading layer `Boston_Neighborhoods' from data source
## `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Boston_Neighborhoods/Boston_Neighborhoods.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 26 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 739715.3 ymin: 2908294 xmax: 812255.1 ymax: 2970086
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to get it on the map
# (from NAD83 / Massachusetts Mainland (ftUS) to geographic WGS84, which leaflet expects)
projected_neighborhoods <- st_transform(neighborhoods, 4326)
Plot the neighborhoods and stations together start stations and neighborhoods
# Popup labels, then a map of neighborhood polygons + 2019 start stations.
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>%
addCircleMarkers(lng = unique_start$start_long,
lat = unique_start$start_lat,
popup = p2_popup)
end stations and neighborhoods
# Map of end stations over the neighborhood polygons.
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p3_popup <- paste0("<strong>End station: </strong>", unique_end$end_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>%
addCircleMarkers(lng = unique_end$end_long,
lat = unique_end$end_lat,
# BUG FIX: was popup = p2_popup, the *start-station* labels built in
# the previous chunk, so the end-station markers showed start-station
# names; use the p3_popup labels built just above instead.
popup = p3_popup)
From these maps, we can get an idea of where in Boston people are moving, both within the larger city boundary and within the neighborhoods of the city. Now I want to use st_intersection on the neighborhoods and the points to find the most popular areas.
One thing to consider in the analysis of the most popular areas is that there are very small neighborhoods in central Boston and very large neighborhoods on the outskirts of the city. It is therefore difficult for a small neighborhood to have the same amount of bike stations as larger neighborhoods. However, on the other hand, from just looking at the maps, the central parts of the city are clearly the most popular, so perhaps this is not an issue.
To get the intersections of the points and the neighborhoods, I create a convex hull around the points. First I transform the longitude and latitude coordinates into a sf object.
# remove NAs if there is any
points <- unique_start %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
# transform points to sf object
# (coordinates are geographic lon/lat, hence crs = 4326 / WGS84)
points_sf <- st_as_sf(points, coords = c("start_long", "start_lat"), crs = 4326)
points_sf
## Simple feature collection with 406 features and 1 field
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -71.16649 ymin: 42.2679 xmax: -71.0061 ymax: 42.4148
## Geodetic CRS: WGS 84
## # A tibble: 406 × 2
## start_station_name geometry
## * <chr> <POINT [°]>
## 1 Dartmouth St at Newbury St (-71.07783 42.35096)
## 2 MIT Stata Center at Vassar St / Main St (-71.09116 42.36213)
## 3 Inman Square at Springfield St. (-71.10016 42.37438)
## 4 Third at Binney (-71.08277 42.36544)
## 5 Verizon Innovation Hub 10 Ware Street (-71.11305 42.37251)
## 6 University Park (-71.10006 42.36265)
## 7 One Kendall Square at Hampshire St / Portland St (-71.09169 42.36628)
## 8 359 Broadway - Broadway at Fayette Street (-71.10441 42.3708)
## 9 Prudential Center - 101 Huntington Ave (-71.08066 42.34652)
## 10 Encore (-71.07245 42.39329)
## # … with 396 more rows
# transform its crs
# NOTE(review): EPSG:3035 is ETRS89 / LAEA *Europe*; for Boston a local
# projected CRS (e.g. EPSG:26986, NAD83 Massachusetts Mainland) would
# distort distances less. The hull/intersection results still work, but
# worth confirming the choice.
points_3035 <- st_transform(points_sf, crs = 3035)
points_3035
## Simple feature collection with 406 features and 1 field
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -902380.1 ymin: 5513497 xmax: -885856.2 ymax: 5529263
## Projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 406 × 2
## start_station_name geometry
## * <chr> <POINT [m]>
## 1 Dartmouth St at Newbury St (-892912 5521013)
## 2 MIT Stata Center at Vassar St / Main St (-892186.8 5522717)
## 3 Inman Square at Springfield St. (-891238.4 5524152)
## 4 Third at Binney (-891637.1 5522270)
## 5 Verizon Innovation Hub 10 Ware Street (-891770.1 5525035)
## 6 University Park (-892377.8 5523436)
## 7 One Kendall Square at Hampshire St / Portland St (-891797.9 5523009)
## 8 359 Broadway - Broadway at Fayette Street (-891702.1 5524264)
## 9 Prudential Center - 101 Huntington Ave (-893420.9 5520963)
## 10 Encore (-888647.6 5523156)
## # … with 396 more rows
# create convex hull
# (st_union first collapses the points into one MULTIPOINT, so the hull
# is computed over all points rather than per point)
points_ch <- st_convex_hull(st_union(points_3035))
points_ch
## Geometry set for 1 feature
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -902380.1 ymin: 5513497 xmax: -885856.2 ymax: 5529263
## Projected CRS: ETRS89-extended / LAEA Europe
## POLYGON ((-899151.8 5513497, -901419.5 5517212,...
# Base plot of the hull; the class check confirms a single sfc_POLYGON.
plot(points_ch)
class(points_ch)
## [1] "sfc_POLYGON" "sfc"
I have now created a polygon out of the start station points.
Let’s get the intersection between this polygon and the neighborhoods multipolygon
# make sure they have the same crs
neighborhoods_3035 <- st_transform(neighborhoods, crs = 3035)
# get the intersections
points_neighborhoods_intersection <- st_intersection(points_ch, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
# (NOTE(review): despite the comment above, st_union only dissolves the
# clipped pieces into one feature -- no buffering is involved)
p_n_buff <- st_union(points_neighborhoods_intersection)
Plot the objects together
# Overlay: hull, clipped neighborhoods (red), and their dissolved union.
plot(points_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)
The plot shows that the neighborhoods occupy a significant amount of space within the points polygon. However, there is also a large area that the neighborhoods do not cover. Let’s look at the map of the start stations and neighborhoods again to compare
# Re-plot of the start-station / neighborhood map from above, for comparison.
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>%
addCircleMarkers(lng = unique_start$start_long,
lat = unique_start$start_lat,
popup = p2_popup)
Looking at the map, we can confirm that towards the northern part of the city, the points occupy a space outside the neighborhoods. Furthermore, recall the points polygon that went down into a point, which is due to the single point located far outside the city to the south.
I do the same for the end stations to see if we find the same patterns.
# Same hull-building pipeline as for the start stations, now for end stations.
# remove NAs if there is any
points_end <- unique_end %>%
filter(! is.na(end_long)) %>%
filter(! is.na(end_lat))
# transform the points into an sf object
points_end_sf <- st_as_sf(points_end, coords = c("end_long", "end_lat"), crs = 4326)
# transform their crs
points_end_3035 <- st_transform(points_end_sf, crs = 3035)
# create convex hull
points_end_ch <- st_convex_hull(st_union(points_end_3035))
# plot it
plot(points_end_ch)
class(points_end_ch)
## [1] "sfc_POLYGON" "sfc"
The polygon looks very much identical to the start stations polygon
Let’s get the intersection
# get the intersections
points_neighborhoods_intersection_end <- st_intersection(points_end_ch, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
# (st_union only dissolves into one feature; no buffering is involved)
p_n_buff_end <- st_union(points_neighborhoods_intersection_end)
Plot the objects together
# Overlay for the end stations: hull, clipped neighborhoods (red), union.
plot(points_end_ch)
plot(points_neighborhoods_intersection_end, border='red', lwd=2, add = TRUE)
plot(p_n_buff_end, add = TRUE)
It looks very much the same for the end stations. Let’s look at the map of the end stations and the neighborhoods to compare
# Re-plot of the end-station / neighborhood map for comparison.
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p3_popup <- paste0("<strong>End station: </strong>", unique_end$end_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>%
addCircleMarkers(lng = unique_end$end_long,
lat = unique_end$end_lat,
# BUG FIX: was popup = p2_popup (start-station labels); the
# end-station markers must use the p3_popup labels built above.
popup = p3_popup)
Once again, the map confirms what we saw in the plot. There is an area beyond the city boundary towards the north where there are a lot of points, and there is a single point towards the south far outside the city, which gives the polygon its point.
From now on, when I just want to plot the stations, I just plot the start stations because the end stations are very similar.
Let’s look at the 2020 data
# Path to the 2020 Bluebikes trip data (~2.0M rows).
file_name <- "data/bluebikes_tripdata_2020.csv"
# read:
data_2020 <- read_csv(file_name, progress = FALSE)
## Rows: 1999446 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): start station name, end station name, usertype, postal code
## dbl (12): tripduration, start station id, start station latitude, start sta...
## dttm (2): starttime, stoptime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Apply the same snake_case renaming as for the 2019 data so both years
# share column names.
data_2020 <- data_2020 %>%
  rename(start_long = `start station longitude`,
         start_lat = `start station latitude`,
         end_long = `end station longitude`,
         end_lat = `end station latitude`,
         start_station_id = `start station id`,
         start_station_name = `start station name`,
         end_station_name = `end station name`,
         birth_year = `birth year`)
Let’s do the same initial exploration of the data to see what we’re working with
# Same sanity checks as for 2019: user types, gender codes, ranges.
# subscribers vs. customers (single use)
data_2020 %>% filter(usertype == "Subscriber") %>% count() # returns 1440363
## # A tibble: 1 × 1
## n
## <int>
## 1 1440363
data_2020 %>% filter(usertype == "Customer") %>% count() # returns 559083
## # A tibble: 1 × 1
## n
## <int>
## 1 559083
# far fewer subscribers than 2019; single-use customer counts are about the same
# genders
# (gender coding is undocumented here -- the 0/1/2 guesses are unverified)
data_2020 %>% filter(gender == 0) %>% count() # 29691 (female?)
## # A tibble: 1 × 1
## n
## <int>
## 1 29691
data_2020 %>% filter(gender == 1) %>% count() # 291321 (male?)
## # A tibble: 1 × 1
## n
## <int>
## 1 291321
data_2020 %>% filter(gender == 2) %>% count() # 94964 (other?)
## # A tibble: 1 × 1
## n
## <int>
## 1 94964
# much more uniform data here
data_2020 %>% select(birth_year) %>% range() # there is no birth year information
## [1] NA NA
# trip duration range
data_2020 %>% select(tripduration) %>% range() # 61 3879352
## [1] 61 3879352
# there are also fewer rows in the 2020 dataset (about 2.0M vs 2.5M in 2019)
Let’s clean the 2020 data
# Keep the same analysis columns as for 2019.
cleaned_2020 <- data_2020 %>% select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))
cleaned_2020
## # A tibble: 1,999,446 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 1793 2020-11-01 00:00:18 2020-11-01 00:30:12 42.3 -71.0
## 2 1832 2020-11-01 00:00:34 2020-11-01 00:31:07 42.3 -71.0
## 3 262 2020-11-01 00:01:54 2020-11-01 00:06:17 42.3 -71.0
## 4 419 2020-11-01 00:04:00 2020-11-01 00:10:59 42.4 -71.1
## 5 275 2020-11-01 00:04:01 2020-11-01 00:08:36 42.4 -71.1
## 6 684 2020-11-01 00:04:39 2020-11-01 00:16:03 42.3 -71.1
## 7 608 2020-11-01 00:04:43 2020-11-01 00:14:52 42.4 -71.1
## 8 438 2020-11-01 00:05:57 2020-11-01 00:13:15 42.4 -71.1
## 9 541 2020-11-01 00:07:15 2020-11-01 00:16:16 42.4 -71.1
## 10 1138 2020-11-01 00:07:17 2020-11-01 00:26:16 42.4 -71.1
## # … with 1,999,436 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
I’ll only look at the start stations
# Distinct 2020 start stations, cleaned like the 2019 data.
unique_start_2020 <- cleaned_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start_2020 <- unique_start_2020[!(unique_start_2020$start_long == 0.0000|unique_start_2020$start_lat == 0.0000), ]
# another outlier
unique_start_2020 <- unique_start_2020 %>% filter(start_station_name != "BCBS Hingham")
Let’s plot the start stations with the neighborhoods
# plot the 2020 start stations over the neighborhood polygons
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p5_popup <- paste0("<strong>Start station: </strong>", unique_start_2020$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>%
addCircleMarkers(lng = unique_start_2020$start_long,
lat = unique_start_2020$start_lat,
popup = p5_popup)
let’s look at the 2019 data for comparison
# 2019 start stations again for comparison; p_popup (the neighborhood
# labels) is reused from the chunk above.
p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>%
addCircleMarkers(lng = unique_start$start_long,
lat = unique_start$start_lat,
popup = p2_popup)
We can see from the maps that there are areas/stations that aren’t in the 2019 data which are also outside of the city boundary. These are stations towards the west of the city boundary, beyond Brighton.
Let’s look at the intersection between the points and neighborhoods in 2020
# Hull-building pipeline for the 2020 start stations.
# remove NAs if there is any
points_2020 <- unique_start_2020 %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
# transform the points into an sf object
points_2020_sf <- st_as_sf(points_2020, coords = c("start_long", "start_lat"), crs = 4326)
# transform their crs
points_2020_3035 <- st_transform(points_2020_sf, crs = 3035)
# create convex hull
points_2020_ch <- st_convex_hull(st_union(points_2020_3035))
# plot it
plot(points_2020_ch)
# BUG FIX: was class(points_end_ch), a copy-paste slip that inspected the
# end-station hull from the earlier section; check the 2020 hull instead.
# (The printed class is the same either way: "sfc_POLYGON" "sfc".)
class(points_2020_ch)
## [1] "sfc_POLYGON" "sfc"
# get the intersections
points_neighborhoods_intersection_2020 <- st_intersection(points_2020_ch, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
# (st_union only dissolves into one feature; no buffering is involved)
p_n_buff_2020 <- st_union(points_neighborhoods_intersection_2020)
Plot the objects together
# Overlay for 2020: hull, clipped neighborhoods (red), union.
plot(points_2020_ch)
plot(points_neighborhoods_intersection_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_2020, add = TRUE)
# Count distinct station *names*. These are lower than the distinct-row
# counts because the same name can appear with slightly different
# coordinates. (Comments fixed to match the printed output: 359 and 384.)
unique_start %>% select(start_station_name) %>% unique() %>% count() # 359
## # A tibble: 1 × 1
## n
## <int>
## 1 359
unique_start_2020 %>% select(start_station_name) %>% unique() %>% count() # 384
## # A tibble: 1 × 1
## n
## <int>
## 1 384
The code above shows that the two years have roughly the same number of unique start station names, with 2020 having 25 more unique start stations than 2019. On the one hand, it is surprising that there is a wider range of start stations in 2020, since most people in the world lived in a highly restricted way due to Covid-19. However, at least in Denmark, this also led to people going outside with more intention, since the walk or bike ride to work was not a part of our daily lives anymore, and this pattern could perhaps also reflect a similar tendency.
Let’s find the most used start station in 2019 and 2020
# Adapted from: https://stackoverflow.com/questions/66787325/how-to-find-the-least-frequent-value-in-a-column-of-dataframe-in-r
# Return the least frequent value of a vector as a named count: the name
# is the value, the entry is its frequency. Ties resolve to the first
# value in table() order; NAs are dropped by table()'s default.
Modemin <- function(x) {
  freq <- table(x)
  freq[which.min(freq)]
}
# NOTE: which.min returns only the *first* minimum, so other stations may
# also have been used exactly once.
# least used start station in 2020: MTL-ECO4-01 (1 time)
Modemin(data_2020$start_station_name)
## MTL-ECO4-01
## 1
# least used start station in 2019: 8D QC Station 01 (1 time)
Modemin(data_2019$start_station_name)
## 8D QC Station 01
## 1
# least used end station in 2020: Mobile Temporary Station 1 (1 time)
Modemin(data_2020$end_station_name)
## Mobile Temporary Station 1
## 1
# least used end station in 2019: 8D QC Station 02 (1 time)
Modemin(data_2019$end_station_name)
## 8D QC Station 02
## 1
# Counterpart to Modemin: return the most frequent value of a vector as a
# named count (name = value, entry = frequency). Ties resolve to the
# first value in table() order; NAs are dropped by table()'s default.
Modemax <- function(x) {
  freq <- table(x)
  freq[which.max(freq)]
}
# Most used stations per year; the same station tops both start and end
# counts within each year.
# most used start station in 2019: MIT at Mass Ave / Amherst St (61056 times)
Modemax(data_2019$start_station_name)
## MIT at Mass Ave / Amherst St
## 61056
# most used start station in 2020: Central Square at Mass Ave / Essex St (32668 times )
Modemax(data_2020$start_station_name)
## Central Square at Mass Ave / Essex St
## 32668
# most used end station in 2019: MIT at Mass Ave / Amherst St (56986 times)
Modemax(data_2019$end_station_name)
## MIT at Mass Ave / Amherst St
## 56986
# most used end station in 2020: Central Square at Mass Ave / Essex St (33493 times)
Modemax(data_2020$end_station_name)
## Central Square at Mass Ave / Essex St
## 33493
So the most used stations in 2020 are used significantly less than the most used station in 2019. Interestingly, the most used start and end stations in 2019 are the same, and the most used start and end stations in 2020 are also the same.
Let’s find out where these stations are and plot them with leaflet
# most used station in 2019
# (the printed coordinates are rounded by tibble printing; the stored
# values have more precision)
data_2019 %>% filter(start_station_name == "MIT at Mass Ave / Amherst St") %>% select(c(start_long, start_lat))
## # A tibble: 61,056 × 2
## start_long start_lat
## <dbl> <dbl>
## 1 -71.1 42.4
## 2 -71.1 42.4
## 3 -71.1 42.4
## 4 -71.1 42.4
## 5 -71.1 42.4
## 6 -71.1 42.4
## 7 -71.1 42.4
## 8 -71.1 42.4
## 9 -71.1 42.4
## 10 -71.1 42.4
## # … with 61,046 more rows
# NOTE(review): lng = -71.1 / lat = 42.4 are the *rounded* display values
# from the tibble print above, not the station's exact coordinates, so
# the marker is only approximately placed -- consider using the unrounded
# values. p_popup still holds the neighborhood labels from earlier.
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 12) %>% addCircleMarkers(lng = -71.1, lat = 42.4) %>%
addPolygons(stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup)
# most used station in 2020
data_2020 %>% filter(start_station_name == "Central Square at Mass Ave / Essex St") %>% select(c(start_long, start_lat))
## # A tibble: 32,668 × 2
## start_long start_lat
## <dbl> <dbl>
## 1 -71.1 42.4
## 2 -71.1 42.4
## 3 -71.1 42.4
## 4 -71.1 42.4
## 5 -71.1 42.4
## 6 -71.1 42.4
## 7 -71.1 42.4
## 8 -71.1 42.4
## 9 -71.1 42.4
## 10 -71.1 42.4
## # … with 32,658 more rows
As it turns out, the two stations have the same longitude and latitude coordinates. Interestingly, the most used station is far outside the city boundary. From the station name in the 2019 data, it seems that this might be a station close to MIT, so let’s find MIT’s coordinates and plot it alongside the point.
# BUG FIX: the two popup labels were swapped -- MIT is the *institution*
# and "Central Square at Mass Ave / Essex St" is the *start station*, not
# the other way around. Also use <- for assignment per R convention.
p_popup <- paste0("<strong>Institution: </strong>", "MIT")
p_popup2 <- paste0("<strong>Start station: </strong>", "Central Square at Mass Ave / Essex St")
# Blue marker: the most used start/end station; red marker: MIT's campus.
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 12) %>% addCircleMarkers(lng = -71.1, lat = 42.4, popup = p_popup2) %>%
addCircleMarkers(lng = -71.092003, lat = 42.360001, popup = p_popup, color = "red") %>%
addPolygons(stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5)
So the central campus of MIT is not near the most used stations in 2019 and 2020, but it makes me think that plotting the different higher learning institutions located in Boston might yield interesting results, seeing as many of the stations are clustered around this area.
Load colleges and universities
# Point layer of 60 colleges and universities (NAD83 ftUS shapefile).
uni <- st_read("data/Colleges_and_Universities/Colleges_and_Universities.shp")
## Reading layer `Colleges_and_Universities' from data source
## `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Colleges_and_Universities/Colleges_and_Universities.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 60 features and 28 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 742669.8 ymin: 2917826 xmax: 780798.5 ymax: 2964159
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to get it on the map
projected_uni <- st_transform(uni, 4326)
# drop rows with (0, 0) coordinates (same missing-location sentinel as before)
projected_uni <- projected_uni[!(projected_uni$Longitude == 0.0000|projected_uni$Latitude == 0.0000), ]
Plot the colleges and universities on a map together with the start stations from 2019 and 2020
# Institutions (red) with start stations (blue): 2019 map, then 2020 map.
p7_popup <- paste0("<strong>Institution: </strong>", projected_uni$Name)
leaflet() %>% addTiles() %>%
addCircleMarkers(lng = projected_uni$Longitude, lat = projected_uni$Latitude, color = "red", popup = p7_popup) %>%
addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
leaflet() %>% addTiles() %>%
addCircleMarkers(lng = projected_uni$Longitude, lat = projected_uni$Latitude, color = "red", popup = p7_popup) %>%
addCircleMarkers(lng = unique_start_2020$start_long, lat = unique_start_2020$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
It does in fact seem that many of the start station points are centered around the location of the colleges and universities.
Let’s make the colleges and universities into a polygon and plot it again
# remove NAs if there is any
points_uni <- projected_uni %>%
filter(! is.na(Longitude)) %>%
filter(! is.na(Latitude))
# transform points into sf object
# (NOTE(review): projected_uni is already sf; this rebuilds the geometry
# from the Longitude/Latitude attribute columns -- verify those columns
# agree with the shapefile geometry)
points_uni_sf <- st_as_sf(points_uni, coords = c("Longitude", "Latitude"), crs = 4326)
# transform its crs
points_uni_3035 <- st_transform(points_uni_sf, crs = 3035)
# create convex hull
points_uni_ch <- st_convex_hull(st_union(points_uni_3035))
# plot it
plot(points_uni_ch)
# transform back to geographic WGS84 (EPSG:4326) so leaflet can draw it
# (NOTE(review): the `_3035` suffix is misleading -- this object is in 4326)
points_uni_ch_3035 <- st_transform(points_uni_ch, crs = 4326)
Let’s plot the colleges and universities polygon together the with 2019 start stations
# University hull (yellow) with 2019, then 2020, start stations.
leaflet(points_uni_ch_3035) %>% addTiles() %>%
addPolygons(color = "yellow") %>%
addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
leaflet(points_uni_ch_3035) %>% addTiles() %>%
addPolygons(color = "yellow") %>%
addCircleMarkers(lng = unique_start_2020$start_long, lat = unique_start_2020$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
So indeed, it does seem that the colleges and institutions take up a fairly large part of the space the points occupy. Let’s confirm this by getting the intersection between the points-polygon and the colleges and universities-polygon
# Get the overlap of the start-station hull and the university hull.
# BUG FIX: the original called st_intersection(st_union(points_ch,
# points_uni_ch)), which self-intersects the *union* of the two hulls
# (returning essentially the union, not the overlap). The narrative asks
# for the intersection *between* the two polygons, so pass them as two
# arguments. Both hulls are in EPSG:3035, so no reprojection is needed.
uni_points_intersection <- st_intersection(points_ch, points_uni_ch)
# dissolve into a single feature (mirrors the neighborhoods workflow above)
p_n_buff_uni <- st_union(uni_points_intersection)
Plot the objects together
# Overlay: start-station hull, the intersection, and the university hull (red).
plot(points_ch)
plot(uni_points_intersection, add = TRUE)
plot(points_uni_ch, border = "red", add = TRUE)
Compare with the neighborhoods and points intersection
# Re-plot the neighborhoods/points overlay from earlier for comparison.
plot(points_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)
Indeed, the intersection of the colleges and universities occupy around the same amount of space as the neighborhoods. However, they also occupy the same space within the points-polygon, and I therefore find the intersection between the neighborhoods and the colleges and universities
# Clip the neighborhoods to the university hull and overlay.
neighborhoods_uni_intersection <- st_intersection(points_uni_ch, neighborhoods_3035)
plot(points_uni_ch)
plot(neighborhoods_uni_intersection, border="red", lwd=2, add = TRUE)
# NOTE(review): p_n_buff is the neighborhoods/start-station union from the
# earlier section -- confirm it is the intended third layer here.
plot(p_n_buff, add = TRUE)
So, the neighborhoods and the learning institutions occupy much of the same space. Therefore, it is not possible to reliably confirm that the users of the bikes are primarily seeking out the learning institutions. However, the overlay of the institutions on the points did capture a real pattern of use, just like the neighborhoods did.
Let’s look at the map with the most used start/end station again
# BUG FIX: as in the earlier chunk, the popup labels were swapped -- MIT
# is the institution; "Central Square at Mass Ave / Essex St" is the
# station. p_popup is kept for parity with that chunk even though only
# p_popup2 is attached to a marker below. Assignments use <- per convention.
p_popup <- paste0("<strong>Institution: </strong>", "MIT")
p_popup2 <- paste0("<strong>Start station: </strong>", "Central Square at Mass Ave / Essex St")
# with slight changes to the view setting
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 15.5) %>% addCircleMarkers(lng = -71.1, lat = 42.4, popup = p_popup2) %>%
addPolygons(stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5)
Here I overlay data from public and non-public schools
# Public-school point layer, reprojected to WGS84, coordinates extracted.
public <- st_read("data/Public_Schools/Public_Schools.shp")
## Reading layer `Public_Schools' from data source
## `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Public_Schools/Public_Schools.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 131 features and 16 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 744291.9 ymin: 2910470 xmax: 790128.2 ymax: 2968120
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
public <- st_transform(public, 4326)
# NOTE(review): `public` is already an sf object, so this st_as_sf() call
# is redundant
public <- st_as_sf(public, crs = 4326)
# NOTE(review): the convex hull of a single POINT is that point, so this
# appears to leave the geometries unchanged before the coordinates are
# extracted -- verify the call is needed
public <- st_convex_hull(public$geometry)
# extract coordinates
public_coordinates <- st_coordinates(public)
class(public_coordinates)
## [1] "matrix" "array"
# transform to data frame
public_df <- public_coordinates %>%
as_tibble()
# I do the same for the non-public schools
# Read non-public school point locations (same source CRS as above).
non_public <- st_read("data/Non_Public_Schools/Non_Public_Schools.shp")
## Reading layer `Non_Public_Schools' from data source
## `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Non_Public_Schools/Non_Public_Schools.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 82 features and 14 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 746614.7 ymin: 2912741 xmax: 791398.7 ymax: 2965487
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# Reproject to WGS84 lon/lat.
non_public <- st_transform(non_public, 4326)
non_public <- st_as_sf(non_public, crs = 4326)
non_public <- st_convex_hull(non_public$geometry)
# extract coordinates
non_public_coordinates <- st_coordinates(non_public)
class(non_public_coordinates)
## [1] "matrix" "array"
# transform to data frame
# (the original ran this identical conversion twice in a row; the
# redundant duplicate assignment has been removed)
non_public_df <- non_public_coordinates %>%
as_tibble()
# help if they need to be sf objects again: https://stackoverflow.com/questions/44214487/create-sf-object-from-two-column-matrix
Now we can plot the schools at points on the map alongside the start stations
# Popup labels that distinguish the two school layers on the map below
# (paste0 with a single literal argument is just that string).
p_popup <- "Public"
p_popup2 <- "Non-public"
# Map: start stations (default blue), public schools (red) and
# non-public schools (yellow), centred on Boston.
leaflet() %>% addTiles() %>%
addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>%
addCircleMarkers(lng = public_df$X, lat = public_df$Y, color = "red", popup = p_popup) %>%
addCircleMarkers(lng = non_public_df$X, lat = non_public_df$Y, color = "yellow", popup = p_popup2) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
This plot seems to reflect a usage pattern dominant in the lower part of the city. There are many overlaps of the points. Let’s transform the schools into a polygon and plot again. I do this by finding the collected shared space between the points.
# Convert the non-public school coordinates back to an sf POINT layer
# in WGS84 and inspect it.
non_public_sf <- st_as_sf(non_public_df, coords = c("X", "Y"), crs = 4326)
class(non_public_sf)
## [1] "sf" "tbl_df" "tbl" "data.frame"
non_public_sf
## Simple feature collection with 82 features and 0 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -71.16578 ymin: 42.24013 xmax: -70.99945 ymax: 42.38436
## Geodetic CRS: WGS 84
## # A tibble: 82 × 1
## geometry
## * <POINT [°]>
## 1 (-71.10514 42.32767)
## 2 (-71.05872 42.30139)
## 3 (-71.13015 42.30658)
## 4 (-71.09574 42.34934)
## 5 (-71.13213 42.24426)
## 6 (-71.06087 42.32163)
## 7 (-71.10991 42.26222)
## 8 (-71.12682 42.25037)
## 9 (-71.07193 42.28944)
## 10 (-71.122 42.28169)
## # … with 72 more rows
# Reproject to EPSG:3035 so subsequent hull/intersection work is in metres.
# NOTE(review): 3035 is ETRS89-LAEA *Europe* (see printout below); for
# Boston data a Massachusetts CRS (e.g. EPSG:26986) would distort less.
non_public_sf <- st_transform(non_public_sf, 3035)
non_public_sf
## Simple feature collection with 82 features and 0 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -905296.7 ymin: 5514781 xmax: -887533.8 ymax: 5526907
## Projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 82 × 1
## geometry
## * <POINT [m]>
## 1 (-895919 5521715)
## 2 (-897213.5 5516538)
## 3 (-898647.9 5522373)
## 4 (-893555.8 5522298)
## 5 (-904763.7 5518759)
## 6 (-895304.3 5517929)
## 7 (-902414.1 5518126)
## 8 (-904025.8 5518718)
## 9 (-898735.2 5516836)
## 10 (-900848.8 5520239)
## # … with 72 more rows
non_public_ch <- st_convex_hull(st_union(non_public_sf))
# Plot the non-public schools hull. (The original called plot() twice
# with identical arguments; the duplicate call has been removed.)
plot(non_public_ch)
# Build the public-school sf layer and its convex hull.
# BUG FIX: the original piped non_public_coordinates here, so public_sf
# silently duplicated the non-public layer; it now uses
# public_coordinates as intended.
public_sf <- public_coordinates %>%
as_tibble() %>%
sf::st_as_sf(coords=c(1,2), crs = 4326)
class(public_sf)
## [1] "sf" "tbl_df" "tbl" "data.frame"
public_sf <- st_transform(public_sf, 3035)
# Convex hull around all public school points.
public_ch <- st_convex_hull(st_union(public_sf))
plot(public_ch)
# Merge the two school hulls into one "schools" polygon and plot it.
schools <- st_convex_hull(st_union(non_public_ch, public_ch))
plot(schools)
Plot the schools with the stations points
# Overlay the schools polygon (yellow) on the station-points hull.
plot(points_ch)
plot(schools, border = "yellow", lwd= 2, add = TRUE)
So the schools also take up around the same space that the neighborhoods and the colleges and universities did.
Instead of focusing on the most used stations, let’s find the top 36 (roughly 10%) most used stations, to see if there is a pattern there.
Top 36 stations in 2019
unique_start %>% select(start_station_name) %>% unique() %>% count() # 361
## # A tibble: 1 × 1
## n
## <int>
## 1 359
# get the station names
# Trip counts per 2019 start station.
stations_2019 <- table(data_2019$start_station_name)
# convert to tibble
stations_2019_df <- as.data.frame(stations_2019) %>% as_tibble()
# get the indices in ascending order of frequency
# (printed below; the last indices are the most-used stations)
order(stations_2019_df$Freq)
## [1] 13 237 233 20 355 232 300 347 82 123 236 150 86 88 246 247 160 345
## [19] 187 222 309 318 234 46 225 260 14 8 162 361 34 161 214 93 99 343
## [37] 342 155 315 5 69 306 51 133 154 258 311 322 196 330 33 92 85 39
## [55] 288 156 354 223 165 216 193 2 40 333 313 159 114 332 47 131 335 27
## [73] 274 102 146 41 145 157 310 104 316 334 182 26 340 360 273 242 254 98
## [91] 136 66 151 134 294 125 54 277 281 31 143 204 312 50 148 138 12 265
## [109] 287 9 329 73 344 68 140 135 249 121 262 43 231 314 348 15 10 336
## [127] 132 158 115 137 181 58 110 362 57 279 32 224 107 270 153 101 45 90
## [145] 139 350 127 28 87 217 124 351 67 352 198 18 266 72 243 280 245 152
## [163] 226 144 42 189 308 111 177 295 255 290 149 105 186 17 70 346 304 7
## [181] 141 49 23 219 363 213 235 211 197 16 220 286 64 221 202 194 303 184
## [199] 74 215 11 37 264 65 239 240 167 75 285 164 19 349 116 293 147 190
## [217] 326 267 188 3 195 206 94 71 299 261 112 250 106 324 283 251 191 212
## [235] 358 52 359 185 317 272 122 305 166 176 296 253 56 63 208 289 117 100
## [253] 113 292 323 302 268 169 179 327 109 103 142 38 337 4 168 339 259 35
## [271] 118 301 44 269 297 271 338 25 171 199 76 357 91 276 126 248 59 178
## [289] 1 356 320 95 257 328 77 319 307 175 128 275 78 21 203 79 325 81
## [307] 291 180 192 321 55 183 163 174 170 129 6 172 263 119 96 209 341 278
## [325] 130 201 29 24 282 256 207 60 53 210 244 284 62 238 205 331 353 48
## [343] 218 108 120 80 89 36 83 30 252 61 97 230 228 173 241 200 22 298
## [361] 229 84 227
# select the top 36
# NOTE(review): these row indices are hand-copied from the tail of the
# order() printout above; they silently break if the data changes.
# order(stations_2019_df$Freq, decreasing = TRUE)[1:36] or
# slice_max(stations_2019_df, Freq, n = 36) would be robust.
top36_2019 <- stations_2019_df[c(24, 282, 256, 207, 60, 53, 210, 244, 284, 62, 238, 205, 331, 353, 48, 218, 108, 120, 80, 89, 36, 83, 30, 252, 61, 97, 230, 228, 173, 241, 200, 22, 298, 229, 84, 227),]
# rename station name column
top36_2019 <- rename(top36_2019,
"start_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019 <- left_join(top36_2019, unique_start, by = "start_station_name")
# somehow I got 41 stations from this, but let's plot them anyway
# (presumably unique_start holds several coordinate pairs for some station
# names, so the join fans out; distinct() before joining would fix it -- TODO confirm)
leaflet() %>% addTiles() %>%
addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat)
Top 38 stations in 2020
# Count distinct 2020 start stations (inline comment says 385; output
# below shows 384).
unique_start_2020 %>% select(start_station_name) %>% unique() %>% count() # 385
## # A tibble: 1 × 1
## n
## <int>
## 1 384
# Trip counts per 2020 start station, as a tibble.
stations_2020 <- table(data_2020$start_station_name)
stations_2020_df <- as.data.frame(stations_2020) %>% as_tibble()
# Indices in ascending frequency order (printed below).
order(stations_2020_df$Freq)
## [1] 253 33 288 49 287 208 382 215 95 134 34 184 232 364 30 227 381 349
## [19] 263 6 231 281 168 154 159 176 118 73 174 250 340 229 277 234 329 360
## [37] 2 48 25 238 92 367 375 257 75 18 265 158 362 138 320 365 94 311
## [55] 252 109 374 338 62 186 167 293 50 42 107 353 221 124 358 150 331 53
## [73] 239 13 337 286 88 166 292 99 259 245 36 43 24 334 307 378 149 74
## [91] 352 155 57 197 172 384 264 140 98 26 136 163 164 314 160 71 8 332
## [109] 170 142 205 70 244 296 147 335 199 129 191 192 300 330 350 380 361 10
## [127] 249 161 333 60 31 260 126 354 203 241 143 104 12 152 52 44 37 326
## [145] 267 315 141 72 279 363 144 91 80 371 139 273 128 304 76 289 54 298
## [163] 308 196 157 114 200 385 93 224 137 207 242 272 216 145 368 78 370 383
## [181] 226 228 77 7 282 256 61 235 306 131 240 283 162 32 262 366 299 165
## [199] 291 251 17 386 16 27 206 313 21 81 302 15 39 171 372 346 106 40
## [217] 379 201 69 321 225 271 59 14 87 68 322 3 194 148 369 46 187 110
## [235] 41 4 218 151 11 117 278 198 121 230 323 336 285 173 309 185 120 325
## [253] 209 122 220 105 175 319 23 119 204 116 169 294 328 156 276 189 324 101
## [271] 327 344 180 355 79 111 153 108 348 343 58 310 317 82 55 376 177 269
## [289] 86 9 47 290 84 133 146 341 268 183 100 179 305 312 237 83 357 195
## [307] 255 280 236 67 345 113 188 347 97 63 316 178 56 213 202 181 303 212
## [325] 339 66 377 359 28 284 275 115 29 130 19 342 211 1 217 193 45 51
## [343] 135 5 35 125 295 356 22 132 351 190 270 318 64 233 266 102 123 297
## [361] 214 301 373 112 261 210 219 85 89 223 222 254 258 248 274 247 182 38
## [379] 246 127 65 20 103 243 96 90
# Hand-picked indices from the tail of the order() printout above.
# NOTE(review): 40 indices are listed although the heading says top 38;
# same fragility as the 2019 selection -- slice_max() would be robust.
top_38_2020 <- stations_2020_df[c(90, 96, 243, 103, 20, 65, 127, 246, 38, 182, 247, 274, 248, 258, 254, 222, 223, 89, 85, 219, 210, 261, 112, 373, 301, 214, 297, 123, 102, 266, 233, 64, 318, 270, 190, 351, 132, 22, 356, 295),]
top_38_2020 <- rename(top_38_2020,
"start_station_name" = Var1)
# Join coordinates; like 2019 this fans out (42 rows) on duplicate
# station-name entries in unique_start_2020.
top_38_2020 <- left_join(top_38_2020, unique_start_2020, by = "start_station_name")
# again, somehow I got 42 rows, but let's plot them
leaflet() %>% addTiles() %>%
addCircleMarkers(lng = top_38_2020$start_long, lat = top_38_2020$start_lat)
# draw a polygon out of the points and plot
It seems that the top stations of 2019 and 2020 overlap a lot, so let’s find their intersections
Find intersection between the two top stations-polygon
# Convex hulls of the 2019 and 2020 top stations, in EPSG:3035.
top_2019 <- st_as_sf(top36_2019, coords = c("start_long", "start_lat"), crs = 4326)
top_2019 <- st_transform(top_2019, crs = 3035)
top_2019_ch <- st_convex_hull(st_union(top_2019$geometry))
plot(top_2019_ch)
top_2020 <- st_as_sf(top_38_2020, coords = c("start_long", "start_lat"), crs = 4326)
top_2020 <- st_transform(top_2020, crs = 3035)
top_2020_ch <- st_convex_hull(st_union(top_2020$geometry))
plot(top_2020_ch)
# BUG FIX: the original called st_intersection(st_union(a, b)), which
# just returns the *union* of the two hulls. The stated goal is their
# overlap, i.e. the pairwise intersection:
top_stations <- st_intersection(top_2019_ch, top_2020_ch)
plot(top_stations, border = "yellow", lwd=2)
plot(top_2019_ch, border="red", lwd=2, add = TRUE)
plot(top_2020_ch, lwd=2, add = TRUE)
Let’s see how the top stations overlap with the different polygons we have created First, the neighborhoods
# Compare the top-stations polygon with the neighborhoods union (red).
plot(top_stations)
plot(p_n_buff, border = "red", lwd=2, add = TRUE)
I need to select specific months and plot their points. I’ll look at June or July for tourist season. And then I’ll also look at the coldest month, January, for a comparison.
I first explore the number of data points in each month
cleaned_2019 %>% filter(month == 6) # June has 274,083 data points
## # A tibble: 274,083 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 241 2019-06-01 00:00:20 2019-06-01 00:04:21 42.4 -71.1
## 2 758 2019-06-01 00:00:28 2019-06-01 00:13:06 42.4 -71.1
## 3 522 2019-06-01 00:00:48 2019-06-01 00:09:30 42.4 -71.1
## 4 474 2019-06-01 00:01:08 2019-06-01 00:09:02 42.4 -71.1
## 5 1289 2019-06-01 00:02:14 2019-06-01 00:23:43 42.3 -71.1
## 6 208 2019-06-01 00:03:43 2019-06-01 00:07:12 42.4 -71.1
## 7 297 2019-06-01 00:03:50 2019-06-01 00:08:48 42.4 -71.1
## 8 3886 2019-06-01 00:04:14 2019-06-01 01:09:00 42.3 -71.1
## 9 578 2019-06-01 00:04:16 2019-06-01 00:13:55 42.3 -71.1
## 10 141875 2019-06-01 00:04:29 2019-06-02 15:29:04 42.3 -71.1
## # … with 274,073 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
cleaned_2019 %>% filter(month == 7) # July has 317,028 data points
## # A tibble: 317,028 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 581 2019-07-01 00:00:41 2019-07-01 00:10:22 42.3 -71.1
## 2 600 2019-07-01 00:00:49 2019-07-01 00:10:50 42.3 -71.1
## 3 349 2019-07-01 00:05:34 2019-07-01 00:11:24 42.4 -71.1
## 4 29455 2019-07-01 00:05:36 2019-07-01 08:16:31 42.3 -71.1
## 5 363 2019-07-01 00:05:45 2019-07-01 00:11:48 42.3 -71.1
## 6 1769 2019-07-01 00:06:28 2019-07-01 00:35:58 42.4 -71.1
## 7 903 2019-07-01 00:06:31 2019-07-01 00:21:34 42.3 -71.1
## 8 787 2019-07-01 00:06:35 2019-07-01 00:19:42 42.3 -71.1
## 9 315 2019-07-01 00:07:20 2019-07-01 00:12:36 42.4 -71.1
## 10 661 2019-07-01 00:08:03 2019-07-01 00:19:05 42.3 -71.1
## # … with 317,018 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
# August and September has the most data points so that is also worth exploring
# Print January trips (the smallest month in 2019).
cleaned_2019 %>% filter(month == 1) # January has the least, 69,872
## # A tibble: 69,872 × 12
## tripduration starttime stoptime start_lat start_long
## <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 371 2019-01-01 00:09:13 2019-01-01 00:15:25 42.4 -71.1
## 2 264 2019-01-01 00:33:56 2019-01-01 00:38:20 42.4 -71.1
## 3 458 2019-01-01 00:41:54 2019-01-01 00:49:33 42.4 -71.1
## 4 364 2019-01-01 00:43:32 2019-01-01 00:49:37 42.4 -71.1
## 5 681 2019-01-01 00:49:56 2019-01-01 01:01:17 42.4 -71.1
## 6 549 2019-01-01 00:50:01 2019-01-01 00:59:10 42.4 -71.1
## 7 304 2019-01-01 00:54:48 2019-01-01 00:59:53 42.4 -71.1
## 8 425 2019-01-01 01:00:48 2019-01-01 01:07:53 42.3 -71.1
## 9 1353 2019-01-01 01:03:34 2019-01-01 01:26:07 42.4 -71.1
## 10 454 2019-01-01 01:08:56 2019-01-01 01:16:30 42.3 -71.1
## # … with 69,862 more rows, and 7 more variables: start_station_name <chr>,
## # end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## # year <dbl>, month <dbl>
Seeing as I know that it’s mainly locals using the bikes, I’ll stick to looking at either June or July as that seems to be peak tourist season, and I therefore expect to see the biggest differences in patterns in these months.
Let’s look at July!
# Unique July 2019 start-station coordinates/names.
July_2019 <- cleaned_2019 %>% filter(month == 7)
July_2019 <- July_2019 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# Remove outlier if it is there
# (rows with lon/lat exactly 0 -- presumably invalid fixes; TODO confirm)
July_2019 <- July_2019[!(July_2019$start_long == 0.0000|July_2019$start_lat == 0.0000), ]
# another outlier
# (BCBS Hingham lies far outside the Boston area)
July_2019 <- July_2019 %>% filter(start_station_name != "BCBS Hingham")
# Map the July 2019 stations over the neighborhood polygons, with
# popups for neighborhood name and station name.
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p3_popup <- paste0("<strong>Start station: </strong>", July_2019$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>%
addCircleMarkers(lng = July_2019$start_long,
lat = July_2019$start_lat,
popup = p3_popup)
Let’s look at January!
# Same preparation and map for January 2019.
January_2019 <- cleaned_2019 %>% filter(month == 1)
January_2019 <- January_2019 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# Remove outlier if it is there
January_2019 <- January_2019[!(January_2019$start_long == 0.0000|January_2019$start_lat == 0.0000), ]
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p4_popup <- paste0("<strong>Start station: </strong>", January_2019$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>%
addCircleMarkers(lng = January_2019$start_long,
lat = January_2019$start_lat,
popup = p4_popup)
We can clearly see that there are fewer points in January, but other than that, it’s difficult to say anything from this map. So I’ll have to think about that. Now I’ll find the intersection between the neighborhoods and the points in July and January
First, July!
# Drop rows with missing coordinates, project to EPSG:3035 and build the
# convex hull of July's station points.
points_July <- July_2019 %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
points_July_sf <- st_as_sf(points_July, coords = c("start_long", "start_lat"), crs = 4326)
points_July_3035 <- st_transform(points_July_sf, crs = 3035)
# create convex hull
points_July_ch <- st_convex_hull(st_union(points_July_3035))
Get the intersections
# get the intersections
# NOTE(review): this overwrites the all-stations object of the same name
# created earlier in the script -- rename if the old one is still needed.
points_neighborhoods_intersection <- st_intersection(points_July_ch, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
p_n_buff_July <- st_union(points_neighborhoods_intersection)
Plot the objects together
plot(points_July_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff_July, add = TRUE)
Again, looks very much like the general one I’ve created
Now, January!
# Same pipeline for January 2019: clean, project, hull, intersect, plot.
points_January <- January_2019 %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
points_January_sf <- st_as_sf(points_January, coords = c("start_long", "start_lat"), crs = 4326)
points_January_3035 <- st_transform(points_January_sf, crs = 3035)
# create convex hull
points_January_ch <- st_convex_hull(st_union(points_January_3035))
Get the intersections
# get the intersections
# NOTE(review): again overwrites points_neighborhoods_intersection.
points_neighborhoods_intersection <- st_intersection(points_January_ch, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
p_n_buff_January <- st_union(points_neighborhoods_intersection)
Plot the objects together
plot(points_January_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff_January, add = TRUE)
The points are much more centered around the central part of Boston in January. In fact, almost the entire area the points cover is within the neighborhoods, which is very different from what we have seen so far. This is the first finding that is different and something to mention in the report.
To sum up so far: the end and start station maps look very similar. There is a clear difference between the July map and the January map, where it seems that the points in January are more centralized and those in July are more dispersed. This is further supported using the st_intersection function, where much more of the polygon created from the January points is covered by the neighborhoods. Therefore, the users of the bikes do not travel as far outside of the city in January as they do in July.
# January 2020: unique stations, drop zero-coordinate rows, map over
# the neighborhood polygons.
January_2020 <- cleaned_2020 %>% filter(month == 1)
January_2020 <- January_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
January_2020 <- January_2020[!(January_2020$start_long == 0.0000|January_2020$start_lat == 0.0000),]
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p1_popup <- paste0("<strong>Station: </strong>", January_2020$start_station_name)
leaflet(projected_neighborhoods) %>% addTiles() %>%
addPolygons(stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>%
addCircleMarkers(lng = January_2020$start_long, lat = January_2020$start_lat, popup = p1_popup) %>% setView(lng = -71.077083, lat = 42.341145, zoom = 11.5)
# July 2020: same preparation (also drops the BCBS Hingham outlier) and map.
July_2020 <- cleaned_2020 %>% filter(month == 7)
July_2020 <- July_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
July_2020 <- July_2020[!(July_2020$start_long == 0.0000|July_2020$start_lat == 0.0000),]
July_2020 <- July_2020 %>% filter(start_station_name != "BCBS Hingham")
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p1_popup <- paste0("<strong>Station: </strong>", July_2020$start_station_name)
leaflet(projected_neighborhoods) %>% addTiles() %>%
addPolygons(stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>%
addCircleMarkers(lng = July_2020$start_long, lat = July_2020$start_lat, popup = p1_popup) %>% setView(lng = -71.077083, lat = 42.341145, zoom = 11.5)
# July 2020: clean, project to EPSG:3035, hull, intersect with
# neighborhoods, plot.
points_July_2020 <- July_2020 %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
points_July_sf_2020 <- st_as_sf(points_July_2020, coords = c("start_long", "start_lat"), crs = 4326)
points_July_3035_2020 <- st_transform(points_July_sf_2020, crs = 3035)
# create convex hull
points_July_ch_2020 <- st_convex_hull(st_union(points_July_3035_2020))
Get the intersections
# get the intersections
points_neighborhoods_intersection_July_2020 <- st_intersection(points_July_ch_2020, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
p_n_buff_July_2020 <- st_union(points_neighborhoods_intersection_July_2020)
Plot the objects together
plot(points_July_ch_2020)
plot(points_neighborhoods_intersection_July_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_July_2020, add = TRUE)
# January 2020: same pipeline as above.
points_January_2020 <- January_2020 %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
points_January_sf_2020 <- st_as_sf(points_January_2020, coords = c("start_long", "start_lat"), crs = 4326)
points_January_3035_2020 <- st_transform(points_January_sf_2020, crs = 3035)
# create convex hull
points_January_ch_2020 <- st_convex_hull(st_union(points_January_3035_2020))
Get the intersections
# get the intersections
points_neighborhoods_intersection_January_2020 <- st_intersection(points_January_ch_2020, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
p_n_buff_January_2020 <- st_union(points_neighborhoods_intersection_January_2020)
Plot the objects together
plot(points_January_ch_2020)
plot(points_neighborhoods_intersection_January_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_January_2020, add = TRUE)
Let’s see how the point patterns look at the peak of lockdown in Boston in 2020. From wikipedia, I can gather that the lockdown started from the end of March 2020, and the code below shows that the least data points are found in April
data_2020 %>% filter(month == 4) # 46,793
## # A tibble: 46,793 × 18
## tripduration starttime stoptime start_station_id
## <dbl> <dttm> <dttm> <dbl>
## 1 297 2020-04-01 00:12:27 2020-04-01 00:17:25 67
## 2 152 2020-04-01 00:18:18 2020-04-01 00:20:50 374
## 3 520 2020-04-01 00:18:21 2020-04-01 00:27:01 59
## 4 3917 2020-04-01 00:23:08 2020-04-01 01:28:26 97
## 5 1110 2020-04-01 00:31:45 2020-04-01 00:50:16 385
## 6 1074 2020-04-01 00:59:10 2020-04-01 01:17:04 149
## 7 214 2020-04-01 00:59:58 2020-04-01 01:03:32 54
## 8 939 2020-04-01 01:11:25 2020-04-01 01:27:04 66
## 9 776 2020-04-01 01:40:55 2020-04-01 01:53:51 66
## 10 716 2020-04-01 02:08:43 2020-04-01 02:20:39 76
## # … with 46,783 more rows, and 14 more variables: start_station_name <chr>,
## # start_lat <dbl>, start_long <dbl>, end station id <dbl>,
## # end_station_name <chr>, end_lat <dbl>, end_long <dbl>, bikeid <dbl>,
## # usertype <chr>, postal code <chr>, year <dbl>, month <dbl>,
## # birth_year <dbl>, gender <dbl>
# compared with 2019's slowest month, January with 69,872 data points
Let’s plot the April 2020 data
# April 2020: unique stations, drop zero-coordinate rows, map over
# the neighborhood polygons.
April_2020 <- cleaned_2020 %>% filter(month == 4)
April_2020 <- April_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# Remove outlier if it is there
April_2020 <- April_2020[!(April_2020$start_long == 0.0000|April_2020$start_lat == 0.0000), ]
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p6_popup <- paste0("<strong>Start station: </strong>", April_2020$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>%
addCircleMarkers(lng = April_2020$start_long,
lat = April_2020$start_lat,
popup = p6_popup) # BUG FIX: was p4_popup (January 2019 labels)
data_2020 %>% select(c(month, start_station_name)) %>% filter(month == 4) %>% select(start_station_name) %>% unique() # 329 unique stations names in April 2020, which means that there are fewer unique stations than in January 2019, but probably not significantly so
## # A tibble: 326 × 1
## start_station_name
## <chr>
## 1 MIT at Mass Ave / Amherst St
## 2 Tremont St at Hamilton Pl
## 3 Chinatown Gate Plaza
## 4 Harvard University River Houses at DeWolfe St / Cowperthwaite St
## 5 Albany St at E. Brookline St
## 6 175 N Harvard St
## 7 Tremont St at West St
## 8 Commonwealth Ave at Griggs St
## 9 Central Sq Post Office / Cambridge City Hall at Mass Ave / Pleasant St
## 10 Gallivan Blvd at Adams St
## # … with 316 more rows
# let's compare the April 2020 map with the January 2019 map
# Re-plot the January 2019 stations for visual comparison.
# NOTE(review): p_popup here still holds the neighborhood labels set in
# the previous chunk -- confirm that is the intended popup.
p4_popup <- paste0("<strong>Start station: </strong>", January_2019$start_station_name)
leaflet(projected_neighborhoods) %>%
addPolygons(
stroke = TRUE,
fillColor = "yellow",
fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
popup = p_popup) %>% # add popup
addTiles() %>%
setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>%
addCircleMarkers(lng = January_2019$start_long,
lat = January_2019$start_lat,
popup = p4_popup)
# April 2020: clean, project to EPSG:3035, hull, intersect with
# neighborhoods, plot (same pipeline as the monthly analyses above).
points_April_2020 <- April_2020 %>%
filter(! is.na(start_long)) %>%
filter(! is.na(start_lat))
points_April_sf_2020 <- st_as_sf(points_April_2020, coords = c("start_long", "start_lat"), crs = 4326)
points_April_3035_2020 <- st_transform(points_April_sf_2020, crs = 3035)
# create convex hull
points_April_ch_2020 <- st_convex_hull(st_union(points_April_3035_2020))
Get the intersections
# get the intersections
points_neighborhoods_intersection_April_2020 <- st_intersection(points_April_ch_2020, st_geometry(neighborhoods_3035))
# create buffer feature of the neighborhoods
p_n_buff_April_2020 <- st_union(points_neighborhoods_intersection_April_2020)
Plot the objects together
plot(points_April_ch_2020)
plot(points_neighborhoods_intersection_April_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_April_2020, add = TRUE)
The points are much more centered toward the central part of Boston in January 2019 than in April 2020, where there are many more points in the southern part of the city.
compare intersection of April 2020 and January 2020
unique_end %>% select(end_station_name) %>% unique() %>% count() # 360
## # A tibble: 1 × 1
## n
## <int>
## 1 360
# get the station names
# Trip counts per 2019 END station.
# NOTE(review): this reuses the stations_2019/stations_2019_df names from
# the start-station analysis, clobbering those objects.
stations_2019 <- table(data_2019$end_station_name)
# convert to tibble
stations_2019_df <- as.data.frame(stations_2019) %>% as_tibble()
# get the indices in ascending order of frequency
order(stations_2019_df$Freq)
## [1] 13 237 341 233 20 232 356 348 300 82 86 88 123 150 160 236 346 247
## [19] 187 246 234 318 309 225 222 46 155 34 14 69 260 315 344 362 214 8
## [37] 93 162 99 156 161 343 133 154 5 306 51 258 322 355 288 193 33 330
## [55] 165 196 39 85 311 92 313 40 332 333 47 216 223 102 114 131 146 121
## [73] 157 335 27 159 145 104 294 41 242 182 316 274 334 310 340 98 361 273
## [91] 26 136 125 66 254 151 134 54 2 143 277 349 31 314 281 138 204 50
## [109] 148 312 12 140 287 231 9 265 68 135 345 329 73 43 262 110 249 10
## [127] 15 336 132 158 58 67 137 181 115 32 57 279 347 363 107 224 270 295
## [145] 101 45 90 139 280 111 153 144 352 217 87 351 255 266 353 127 70 42
## [163] 198 226 124 152 243 18 177 189 186 245 290 308 49 211 17 37 72 286
## [181] 197 213 304 364 141 219 23 64 105 7 19 235 16 220 149 28 293 202
## [199] 75 221 285 184 303 194 11 215 240 188 65 167 74 164 112 350 261 239
## [217] 147 116 267 326 3 94 195 113 250 206 71 212 360 264 283 324 317 251
## [235] 106 190 52 191 185 359 122 299 176 253 305 339 296 63 272 323 208 302
## [253] 169 292 179 289 109 56 100 327 268 38 142 118 103 337 297 117 4 168
## [271] 44 35 269 259 301 271 166 91 171 126 276 199 76 338 178 358 357 77
## [289] 248 163 320 1 319 59 257 25 95 175 6 192 307 328 79 21 78 275
## [307] 203 183 325 291 128 321 81 180 174 263 342 172 55 96 119 256 129 170
## [325] 29 201 278 282 207 24 130 284 331 60 53 244 209 62 210 238 205 218
## [343] 48 354 108 80 89 36 252 120 30 83 61 97 230 200 228 173 298 229
## [361] 241 22 84 227
# select the top 36
# Hand-copied tail of the order() printout above; same fragility caveat
# as for the start stations.
top36_2019_end <- stations_2019_df[c(207, 24, 130, 284, 331, 60, 53, 244, 209, 62, 210, 238, 205, 218, 48, 354, 108, 80, 89, 36, 252, 120, 30, 83, 61, 97, 230, 200, 228, 173, 298, 229, 241, 22, 84, 227),]
# rename station name column
top36_2019_end <- rename(top36_2019_end,
"end_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019_end <- left_join(top36_2019_end, unique_end, by = "end_station_name")
# Point layers for the top start and end stations (WGS84); the commented
# code below is a draft for drawing origin-destination LINESTRINGs.
coords_start_2019 <- top36_2019 %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)
coords_end_2019 <- top36_2019_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)
#lines <- st_sfc(mapply(function(a,b){
# st_cast(st_union(a,b),"LINESTRING")},
#coords_start_2019$geometry, coords_end_2019$geometry, SIMPLIFY=FALSE))
# https://stackoverflow.com/questions/65498300/how-to-efficiently-create-linestrings-from-points